In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import  train_test_split 
import seaborn as sns
import plotly.express as px
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.ensemble import AdaBoostClassifier,GradientBoostingClassifier
from sklearn.svm import SVC
from itertools import combinations

Load the data¶

In [2]:
churn = pd.read_csv('df2_encoded.csv')
df = pd.DataFrame(churn)
#df.head(100)
# gathering descriptive statistics
df.describe(include='all')
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1129 entries, 0 to 1128
Data columns (total 16 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   stag          1129 non-null   float64
 1   event         1129 non-null   int64  
 2   gender        1129 non-null   int64  
 3   age           1129 non-null   float64
 4   industry      1129 non-null   int64  
 5   profession    1129 non-null   int64  
 6   traffic       1129 non-null   int64  
 7   coach         1129 non-null   int64  
 8   head_gender   1129 non-null   int64  
 9   greywage      1129 non-null   int64  
 10  way           1129 non-null   int64  
 11  extraversion  1129 non-null   float64
 12  independ      1129 non-null   float64
 13  selfcontrol   1129 non-null   float64
 14  anxiety       1129 non-null   float64
 15  novator       1129 non-null   float64
dtypes: float64(7), int64(9)
memory usage: 141.3 KB
In [3]:
df.shape
Out[3]:
(1129, 16)
In [4]:
df.columns
Out[4]:
Index(['stag', 'event', 'gender', 'age', 'industry', 'profession', 'traffic',
       'coach', 'head_gender', 'greywage', 'way', 'extraversion', 'independ',
       'selfcontrol', 'anxiety', 'novator'],
      dtype='object')

Modeling the Data¶

In [5]:
# looking for imbalance in the data set
df['event'].value_counts()
Out[5]:
event
1    571
0    558
Name: count, dtype: int64
In [6]:
# class imbalance percentage
print("The percentage of majority class is:", round((len(df[df['event']==1])/df.shape[0])* 100, 2),'%')
The percentage of majority class is: 50.58 %
  • I will be using a logistic regression algorithm to model the data. For 'event', we have 50.6 % left the company and 49.4 % decided to stay. Therefore,we have a balanced class.
In [7]:
# dropping the duplicates and save the dataset in a new variable
#df_new = df.drop_duplicates()
#df_new.reset_index(inplace=True, drop=True)
#df_new.head(10)

Data Visualizations¶

In [8]:
df.hist(figsize=(14,9), xrot=45)
plt.show
Out[8]:
<function matplotlib.pyplot.show(close=None, block=None)>
No description has been provided for this image

Correlation Matrix and Heatmap¶

In [9]:
# creating a correlation matrix
corr=df.corr()
#corr
# heatmap of the correlation matrix

df_corr = df.corr().round(2)
fig = px.imshow(df_corr, text_auto = True, labels=dict(color="Correlation"), width=600, height=600)
fig.show()
  • From the correlations alone, we see very weak correlation to 'event' as all columns have very weak to no linear correlations, meaning we cant really extract much information.
  • Lets check whats the distribution of employee that left the company or not
In [10]:
fig = px.pie(df, "event", color='event', hole=.3)
fig.show()
  • Checking if experience (time) is a factor that affects employee from resigning
In [11]:
fig = px.histogram(df, x="stag", color='event', marginal='box', barmode='group')
fig.show()
  • Checking if age is a factor that affects employees from leaving the company
In [12]:
fig = px.histogram(df, x="age", color='event', marginal='box', barmode='group')
fig.show()
  • Age does not have strong correaltion to employee resigning

A boxplot and a regplot of employee retention based on the anxiety level.¶

In [13]:
fig, axes =plt.subplots(1,2, figsize=(12,4))
sns.boxplot(x='event',y='anxiety', showfliers=False, data=df, ax=axes[0])
axes[0].set_xlabel("Employee resigned from  the company or not")
axes[0].set_ylabel("Anxiety level")
axes[0].set_title("Boxplot for Anxiety Level vs employee left the company or not")

sns.regplot(x='event',y='anxiety', data=df, ax=axes[1])
axes[1].set_xlabel("Employee left the company or not")
axes[1].set_ylabel("Anxiety level")
axes[1].set_title("Linear Regression Model Fit")

plt.show()
No description has been provided for this image
In [14]:
import seaborn as sns
import matplotlib.pyplot as plt

# Assuming 'stag' and 'event' are columns in df
if 'stag' in df.columns:
#    df['stag'] = df['stag'] / 12
    df['stag'] = df['stag'] 
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# Boxplot for 'stag'
sns.boxplot(x='event', y='stag', showfliers=False, data=df, ax=axes[0])
axes[0].set_xlabel("Employee resigned from the company or not")
axes[0].set_ylabel("selfcontrol")
axes[0].set_title("Boxplot for stag vs employee left the company or not")

# Linear regression plot for 'stag/12'
sns.regplot(x='event', y='stag', data=df, ax=axes[1])
axes[1].set_xlabel("Employee left the company or not")
axes[1].set_ylabel("stag [year]")
axes[1].set_title("Linear Regression Model Fit")

plt.show()
No description has been provided for this image
  • From the boxplot we can see the employees who've left has a lower tenure than the employees who've stayed. The employees who work in the company longer are more likely to stay.
In [15]:
sns.countplot(data=df, x='profession', hue ='event')
plt.xlabel("profession")
plt.title("profession vs. If the employee left the company")
plt.legend(title = "Employee left?")
plt.xticks(rotation = 90)
plt.show()
No description has been provided for this image

Preparing Data for Modelling¶

  • Our variable is 'event' i.e. we are trying to find out whether or not an

employee will leave the company, based on other independent variables. From the heatmap we can see, the correlation with other variable is weak.

  • The dependent variable 'event' is binary i.e. it has only two results 0 (if the employee didn't leave) and 1 (if the employee left).

Creating Classification model and fitting the data¶

We will use Logistic Regression Model¶

In [16]:
# 'event' is the target variable that we want to predict
target = df['event']
# Drop the target variable from the features
features = df.drop('event', axis=1)
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
In [17]:
print(f'X_train : {X_train.shape}')
print(f'y_train : {y_train.shape}')
print(f'X_test : {X_test.shape}')
print(f'y_test : {y_test.shape}')
X_train : (903, 15)
y_train : (903,)
X_test : (226, 15)
y_test : (226,)

Initialize the Logistic Regression Model- No scaling¶

In [18]:
# Initialize the LogReg classifier
logreg1 = LogisticRegression(max_iter = 1000, random_state = 42)
# Fit the model on the training data
logreg1.fit(X_train, y_train)


# Make predictions on the training data
train_predictions = logreg1.predict(X_train)

# Calculate the confusion matrix
conf_matrix = confusion_matrix(y_train, train_predictions)

# Print the confusion matrix
print("Confusion Matrix:")
print(conf_matrix)

plt.figure(figsize=(2, 1))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
plt.show()

# Get the classification report for the training data
class_report_train = classification_report(y_train, train_predictions)
print("Classification Report for Training Data:")
print(class_report_train)

# If you want to see the accuracy score for the training data
accuracy_score_train = logreg1.score(X_train, y_train)
print(f"Accuracy on Training Data: {accuracy_score_train}")

# If you want to see the accuracy score for the testing data
accuracy_score_test= logreg1.score(X_test, y_test)
print(f"Accuracy on Testing Data: {accuracy_score_test}")
Confusion Matrix:
[[255 195]
 [185 268]]
No description has been provided for this image
Classification Report for Training Data:
              precision    recall  f1-score   support

           0       0.58      0.57      0.57       450
           1       0.58      0.59      0.59       453

    accuracy                           0.58       903
   macro avg       0.58      0.58      0.58       903
weighted avg       0.58      0.58      0.58       903

Accuracy on Training Data: 0.5791805094130675
Accuracy on Testing Data: 0.5575221238938053

Using Traning data, with Standard Scaling¶

In [19]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train2 = scaler.fit_transform(X_train)
X_test2 = scaler.fit_transform(X_test)

# Initialize the KNN classifier
logreg2 = LogisticRegression(max_iter = 1000, random_state = 42)

# Fit the model on the training data
logreg2.fit(X_train2, y_train)


# Make predictions on the training data
train_predictions2 = logreg2.predict(X_train2)

# Calculate the confusion matrix
conf_matrix2 = confusion_matrix(y_train, train_predictions2)

# Print the confusion matrix
print("Confusion Matrix2:")
print(conf_matrix2)


# Get the classification report for the training data
class_report_train2 = classification_report(y_train, train_predictions2)
print("Classification Report for Training Data:")
print(class_report_train2)

# If you want to see the accuracy score for the training data
accuracy_score_train2 = logreg2.score(X_train2, y_train)
print(f"Accuracy on Training Data: {accuracy_score_train2}")

# If you want to see the accuracy score for the testing data
accuracy_score_test2= logreg2.score(X_test2, y_test)
print(f"Accuracy on Testing Data: {accuracy_score_test2}")
Confusion Matrix2:
[[255 195]
 [186 267]]
Classification Report for Training Data:
              precision    recall  f1-score   support

           0       0.58      0.57      0.57       450
           1       0.58      0.59      0.58       453

    accuracy                           0.58       903
   macro avg       0.58      0.58      0.58       903
weighted avg       0.58      0.58      0.58       903

Accuracy on Training Data: 0.5780730897009967
Accuracy on Testing Data: 0.5530973451327433

Using only with Traning data, with MinMax Scaling¶

In [20]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_train3 = scaler.fit_transform(X_train)
X_test3 = scaler.fit_transform(X_test)

# Initialize the KNN classifier
logreg3 = LogisticRegression(max_iter = 1000, random_state = 42)

# Fit the model on the training data
logreg3.fit(X_train3, y_train)


# Make predictions on the training data
train_predictions3 = logreg3.predict(X_train3)

# Calculate the confusion matrix
conf_matrix3 = confusion_matrix(y_train, train_predictions3)

# Print the confusion matrix
print("Confusion Matrix3:")
print(conf_matrix3)


# Get the classification report for the training data
class_report_train3 = classification_report(y_train, train_predictions3)
print("Classification Report for Training Data:")
print(class_report_train3)

# If you want to see the accuracy score for the training data
accuracy_score_train3 = logreg3.score(X_train3, y_train)
print(f"Accuracy on Training Data: {accuracy_score_train3}")

# If you want to see the accuracy score for the testing data
accuracy_score_test3= logreg3.score(X_test3, y_test)
print(f"Accuracy on Testing Data: {accuracy_score_test3}")
Confusion Matrix3:
[[264 186]
 [188 265]]
Classification Report for Training Data:
              precision    recall  f1-score   support

           0       0.58      0.59      0.59       450
           1       0.59      0.58      0.59       453

    accuracy                           0.59       903
   macro avg       0.59      0.59      0.59       903
weighted avg       0.59      0.59      0.59       903

Accuracy on Training Data: 0.5858250276854928
Accuracy on Testing Data: 0.5575221238938053

Start with Cross_Val_Score (3 folder is best) with MinMaxScaler¶

In [21]:
from sklearn.model_selection import cross_val_score

logreg4 = LogisticRegression(max_iter = 1000, random_state = 42)

for i in range(2, 11):
    scores = cross_val_score(logreg4, X_train3, y_train, cv=i, scoring='accuracy') #with Standard Scaling
    mean_score = scores.mean()
    std_dev = scores.std()
    print(f"Mean accuracy for {i} folds: {mean_score:.3f}")
    print(f"Standard deviation for {i} folds: {std_dev:.3f}")
    print('*'*50)
Mean accuracy for 2 folds: 0.540
Standard deviation for 2 folds: 0.007
**************************************************
Mean accuracy for 3 folds: 0.555
Standard deviation for 3 folds: 0.003
**************************************************
Mean accuracy for 4 folds: 0.553
Standard deviation for 4 folds: 0.010
**************************************************
Mean accuracy for 5 folds: 0.552
Standard deviation for 5 folds: 0.038
**************************************************
Mean accuracy for 6 folds: 0.546
Standard deviation for 6 folds: 0.011
**************************************************
Mean accuracy for 7 folds: 0.538
Standard deviation for 7 folds: 0.045
**************************************************
Mean accuracy for 8 folds: 0.543
Standard deviation for 8 folds: 0.034
**************************************************
Mean accuracy for 9 folds: 0.544
Standard deviation for 9 folds: 0.042
**************************************************
Mean accuracy for 10 folds: 0.550
Standard deviation for 10 folds: 0.058
**************************************************
In [ ]:
 
  • here start with Cross_Val_Score with MinMax Scaling

Use with GridSearchCV with Standard Scaling¶

In [22]:
param_grid = {
    'C': [ 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga']}

# Instantiate logistic regression model
logistic_model = LogisticRegression(max_iter = 1000, random_state = 42)

# Create GridSearchCV object
grid_search = GridSearchCV(logistic_model, param_grid, cv =3, 
                           scoring = ['accuracy', 'recall', 'precision', 'f1'], refit = 'recall')


logistic_model
Out[22]:
LogisticRegression(max_iter=1000, random_state=42)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression(max_iter=1000, random_state=42)
In [23]:
import warnings
warnings.filterwarnings('ignore', module='sklearn')

# Fit the grid search to the data
grid_search.fit(X_train3, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Get the best model
logistic_model_best_estimater = grid_search.best_estimator_
Best Hyperparameters: {'C': 1, 'penalty': 'l2', 'solver': 'saga'}
In [24]:
logistic_model_best_estimater
Out[24]:
LogisticRegression(C=1, max_iter=1000, random_state=42, solver='saga')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression(C=1, max_iter=1000, random_state=42, solver='saga')
In [25]:
logistic_model_best_estimater.fit(X_train3,y_train)
Out[25]:
LogisticRegression(C=1, max_iter=1000, random_state=42, solver='saga')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression(C=1, max_iter=1000, random_state=42, solver='saga')
In [26]:
print (f'Train Accuracy - : {logistic_model_best_estimater.score(X_train3, y_train):.3f}') #Mit Sklaierung
print (f'Test Accuracy - : {logistic_model_best_estimater.score(X_test3,y_test):.3f}') # Mit Sklaierun
Train Accuracy - : 0.586
Test Accuracy - : 0.558
  • Feature Improtance
  • A function is defined below to gererate a sorted coefficient table.
In [27]:
 
 # Write a function to generate coefficient table
def get_coefficient_table(model, training_set):
    '''
    Generate a coefficient table for a regression model.

    Parameters:
    - model (object): The trained model for which coefficients are to be analyzed.
    - training_set (DataFrame): The DataFrame containing the features used for training the model.

    Returns:
    DataFrame: A table containing feature names and their corresponding coefficients.
    The table is sorted by the magnitude (absolute value) of the coefficients in descending order.
    '''
    # Create coefficient table
    coefficient_table = pd.DataFrame(training_set.columns, columns = ['Features']).copy()
    coefficient_table.insert(len(coefficient_table.columns), "Coefficients", model.coef_.transpose())

    # Sort coefficient table by magnitude (absolute value) of the coefficients
    coefficient_table['Absolute_Coefs'] = coefficient_table['Coefficients'].abs()
    sorted_coefficient_table = coefficient_table.sort_values(by = 'Absolute_Coefs', ascending=False, ignore_index=True)
    sorted_coefficient_table = sorted_coefficient_table.drop(columns = ['Absolute_Coefs'])

    return sorted_coefficient_table
  • Now we can have a coefficient table sorted by the magnitude of each feature's impact on employee turnover.
In [28]:
 # Create a coefficient table from the selected model
get_coefficient_table(logistic_model_best_estimater, X_train)
Out[28]:
Features Coefficients
0 age -1.034868
1 anxiety -0.740016
2 stag -0.509231
3 way 0.398189
4 traffic 0.394109
5 selfcontrol -0.309708
6 independ 0.309617
7 greywage -0.296845
8 head_gender 0.217827
9 coach -0.176564
10 profession -0.153280
11 extraversion -0.107094
12 industry 0.044168
13 novator 0.040276
14 gender 0.039417
  • The table shows that key features influencing employee turnover include age, anxiety, stag, way, traffic. Additionally, self control independent and greywage.
In [ ]: